MDS clustering

Pierre Guilmin

November 2018

Requirements

Built with R version 3.5.1

  • Nicola Roberts HDP R package (GitHub link) which implements the hierarchical Dirichlet process:
    devtools::install_github("nicolaroberts/hdp", build_vignettes = TRUE)
    
  • Various R libraries:
    • ggplot2
    • reshape2
    • gridExtra
    • tibble
    • survminer
    • survival
    • glmnet
    • RColorBrewer
    • IRdisplay

In [1]:
library('ggplot2')
library('reshape2')
library('gridExtra')
library('tibble')
library('survminer')
library('survival')
library('glmnet')
library('RColorBrewer')
library('IRdisplay')

source('utils/tools.R')     # custom tools function
source('utils/hdp_tools.R') # hdp related functions

theme_set(theme_minimal())

# set jupyer notebook parameters
options(repr.plot.res        = 100, # set a medium-definition resolution for the jupyter notebooks plots (DPI)
        repr.matrix.max.rows = 200, # set the maximum number of rows displayed
        repr.matrix.max.cols = 200) # set the maximum number of columns displayed
Loading required package: ggpubr
Loading required package: magrittr
Loading required package: Matrix
Loading required package: foreach
Loaded glmnet 2.0-16

Run citation('hdp') for citation instructions,
    and file.show(system.file('LICENSE', package='hdp')) for license details.

Get data

In [2]:
# get cytogenetics data
dd_cyto <- read.table("data/dd_cyto_cut15.tsv", sep = '\t', stringsAsFactors = FALSE, header = TRUE)

# get mutation data
dd_mutation <- read.table("data/dd_mutation_hotspot_cut10.tsv", sep = '\t', stringsAsFactors = FALSE, header = TRUE)

# merge cytogenetics and mutation data
cat(all(rownames(dd_mutation) == rownames(dd_cyto)), '\n\n') # check if same row ordering
dd_all <- cbind(dd_mutation, dd_cyto)

print_size(dd_all)
head(dd_all, 3)
TRUE 

Size of dd_all: 3300 x 157
ARID1AARID2ASXL1ASXL2ATRXBCORBCORL1BRAFBRCC3CALRCBLCDKN1BCDKN2ACEBPACREBBPCSF1RCSF3RCSNK1A1CTCFCUX1DDX23DDX4DDX41DDX54DHX33DICER1DNMT3ADNMT3BEEDEGFREP300ETNK1ETV6EZH2FAM175AFLT3GATA1GATA2GNASGNB1HIPK2IDH1IDH2_140IDH2_172IRF1JAK2JARID2KDM5CKDM6AKITKMT2AKMT2CKMT2DKRASLUC7L2MGAMPLMYCNF1NFE2NIPBLNOTCH1NOTCH2NPM1NRASPAPD5PHF6PHIPPIK3CAPPM1DPRPF40BPRPF8PTPN11PTPRFRAD21RAD50RASGRF1RB1ROBO1ROBO2RRASRUNX1SETBP1SETD2SF3B1SH2B3SMC1ASMC3SMG1SPRED2SRCAPSRSF2STAG2STAT3SUZ12TERTTET2TP53U2AF1_157U2AF1_34U2AF2WT1YLPM1ZBTB33ZMYM3ZNF318ZRSR2del5qplus8del7delYdel20qmardel7qdel12pdel11qdel18del17pdel5plus21del17del13del3pplus1qdel16del20plus11qplus1pdel16qdel12del1pplus19del13qdel21del9qplus1plus22plus17qdel4qplus11plus14del3del3qplus17pr_3_3del11pdel14del15delXplus9r_9_9plus13plus15r_1_7del22ringWGA
I-H-132697-T1-1-D1-10010000000000000100000000010000000010000000000000000000000000000000000000000000001000000001000001010000000000001000000000000000000000000000000000000000000000
I-H-132698-T1-1-D1-10010000000000010000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000
I-H-116889-T2-1-D1-10000000000000000000000000000000001000000000000000001000000000000000000000000000000001000000000000000010000000000000000000000000000000000000000001000000000000

164 patients don't have any event:

In [3]:
nrow(dd_all[apply(dd_all, 1, function(x) sum(x)) == 0,])
164

Run HDP

We run HDP using multiple independent posterior sampling chains and a initial number of raw cluster of 15 (initcc = 15).
All the functions used are defined in utils/hdp_tools.R and make use of Nicola Roberts HDP R package (see notebook header).
To follow the HDP terminology, in the rest of the notebook the term component is used for cluster and the term category is used for one gene/cytogenetic feature (ie we have 157 categories).

In [4]:
# initialise hdp
hdp <- initialise_hdp(dd_all)
Initialise HDP on a 3300 x 157 dataframe
  → create HDP structure... done!
  → add DP node for each patient... done!
  → assign the data to the nodes... done!
In [5]:
# run multiple independent posterior sampling chains with different random seeds

number_of_chains <- 3
chain_list <- vector('list', number_of_chains)

for (i in 1:number_of_chains) {
    seed <- i * 100
    print_and_flush(sprintf('### Experiment %d (seed = %d) ###\n', i, seed))
    
    # run single hdp chain
    chain_list[[i]] <- activate_and_run_hdp(hdp,
                                            initcc = 15,
                                            burnin = 1000,
                                            n      = 300,
                                            space  = 20,
                                            seed   = seed)
    print_and_flush('\n')
}

multi_output <- hdp_multi_chain(chain_list)
multi_output
### Experiment 1 (seed = 100) ###
Activate HDP nodes and run posterior sampling
  → activate HDP nodes... done!
  → run posterior sampling...
[1] "1000 burn-in iterations in 0.2 mins"

### Experiment 2 (seed = 200) ###
Activate HDP nodes and run posterior sampling
  → activate HDP nodes... done!
  → run posterior sampling...
[1] "1000 burn-in iterations in 0.2 mins"

### Experiment 3 (seed = 300) ###
Activate HDP nodes and run posterior sampling
  → activate HDP nodes... done!
  → run posterior sampling...
[1] "1000 burn-in iterations in 0.2 mins"

Object of class hdpSampleMulti 
 Number of chains: 3 
 Total posterior samples: 900 
 Components: NO. Run hdp_extract_components 
 ----------
 Final hdpState from first chain: 
Object of class hdpState 
 Number of DP nodes: 3301 
 Index of parent DP: 0 1 1 1 1 1 1 1 1 1 ...
 Number of data items per DP: 0 9 3 5 10 1 0 4 3 6 ...
 Index of conparam per DP: 1 1 1 1 1 1 1 1 1 1 ...
 Conparam hyperparameters and current value:
           Shape Rate    Value
Conparam 1     1    1 1.602232
 Number of data categories: 157 
 Number of clusters: 14 
 Initialised with 15 clusters, using random seed 100 
In [6]:
# assess quality of posterior sampling chain
for (experiment in chains(multi_output))
    plot_posterior_sampling_chain_quality(experiment, 15, 5)
In [7]:
# extract and plot components
multi_output <- extract_components(multi_output)
plot_components_size(multi_output)
Extract HDP components from posterior sampling
  → extract components... done!
* 11 components found

WARNING: component 0 is a "garbage" component, not a real component.

In [8]:
plot_category_distribution_by_component(multi_output, colnames(dd_all))
In [9]:
# get top 7 categories for each component
get_top_categories_by_component(multi_output, colnames(dd_all), top_number = 7)
component_0component_1component_2component_3component_4component_5component_6component_7component_8component_9component_10component_11
ASXL1 TET2 TP53 ASXL1 SF3B1 ASXL1 del5q ASXL1 U2AF1_34 DNMT3A plus21 plus1q
U2AF1_157SRSF2 del5q SRSF2 TET2 SETBP1 SF3B1 EZH2 BCOR NPM1 plus8 del7q
TET2 ASXL1 mar STAG2 DNMT3A U2AF1_157DNMT3A TET2 DNMT3A FLT3 plus9 r_1_7
KMT2D SF3B1 del7 RUNX1 delY del7 TP53 RUNX1 del20q WT1 plus14 ETNK1
r_9_9 CBL del7q TET2 KMT2C ETV6 plus8 ZRSR2 BCORL1 PTPN11 TP53 plus1
PHF6 CUX1 del12p IDH2_140 KMT2D RUNX1 CSNK1A1 U2AF1_157RUNX1 NRAS plus19 ZMYM3
STAT3 DDX41 del18 BCOR ZBTB33 PTPN11 TET2 PHF6 NRAS KDM6A plus1 GATA1

Study HDP components

Let's get the probability prediction for each patient and each component. Notice that the 164 patients with no mutation/cytogenetics are NA rows.

In [10]:
dd_predicted <- get_prediction_result_dataframe(multi_output, dd_all)
head(dd_predicted)
Number of components: 11
Number of NA rows   : 164
component_0component_1component_2component_3component_4component_5component_6component_7component_8component_9component_10component_11predicted_componentmax_proba
0.01691358 0.04098765 0.009382716 0.07901235 0.16148148 0.37283951 0.07950617280.08716049 0.101728395 0.05086420 0 0.00012345685 0.3728395
0.11259259 0.03074074 0.000000000 0.16444444 0.06370370 0.11111111 0.00074074070.48851852 0.014814815 0.01333333 0 0.00000000007 0.4885185
0.01044444 0.02488889 0.002888889 0.02133333 0.40977778 0.36800000 0.02444444440.07177778 0.008444444 0.05777778 0 0.00022222224 0.4097778
0.01444444 0.11433333 0.071888889 0.04766667 0.04400000 0.03044444 0.00333333330.64288889 0.006555556 0.01188889 0 0.01255555567 0.6428889
0.02555556 0.19777778 0.000000000 0.33333333 0.02555556 0.18222222 0.00000000000.23555556 0.000000000 0.00000000 0 0.00000000003 0.3333333
NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaNNaN NaN
In [11]:
n_components <- ncol(dd_predicted) - 2 - 1 # without columns 'component_0', 'predicted_component' and 'max_proba'
n_components
11

Component size in the dataset:

In [12]:
get_table(dd_predicted[,'predicted_component'])
valuescountfreq
1 665 20.2%
4 654 19.8%
3 463 14%
2 355 10.8%
6 287 8.7%
5 243 7.4%
7 205 6.2%
NaN 164 5%
8 151 4.6%
9 75 2.3%
10 20 0.6%
11 18 0.5%
-- total --3300 100%
In [13]:
plot_continous_feature_by_components <- function(data, feature_name, scale_y) {    
    set_notebook_plot_size(25, 7)
    
    myLabeller <- function(x) {
        x$predicted_component <- apply(x, 1, function(s) sprintf('Component %s (n = %d)', s, nrow(data[data$predicted_component == s,])))
        return (x)
    }
        
    n_initial <- nrow(data)
    data <- data[!is.na(data[feature_name]),]
                                       
    p1 <- ggplot(data) + geom_boxplot(aes_string('predicted_component', feature_name, fill = 'predicted_component'), alpha = 0.7, notch = TRUE) + scale_fill_brewer(palette = 'Spectral', na.value = 'grey') +
              ggtitle(sprintf('%s density by component\n(removed %d NA values)', feature_name, n_initial - nrow(data))) +  theme(plot.title = element_text(size = 18, face = "bold"))
                                       
    p2 <- ggplot(data) + geom_violin(aes_string('predicted_component', feature_name, fill = 'predicted_component'), alpha = 0.7) + scale_fill_brewer(palette = 'Spectral', na.value = 'grey') +
              ggtitle(' \n ') +  theme(plot.title = element_text(size = 18, face = "bold"))

    p3 <- ggplot(data) + geom_density(aes_string(feature_name, fill = 'predicted_component'), alpha = 0.7) + scale_fill_brewer(palette = 'Spectral', na.value = 'grey') +
              facet_wrap(~ predicted_component, ncol = 4, labeller = myLabeller) +
              ggtitle(' \n ') +  theme(plot.title = element_text(size = 18, face = "bold"))
                                       
    if (scale_y) {
        p1 <- p1 + scale_y_sqrt()
        p2 <- p2 + scale_y_sqrt()
    }
                                       
    grid.arrange(p1, p2, p3, ncol = 3)
}

#suppressWarnings(plot_continous_feature_by_components(dd_clinical, 'HB', TRUE))
In [14]:
plot_assignement_probability_by_component(dd_predicted)
In [15]:
plot_assignement_probability_distribution_by_component(dd_predicted, 25, 10)

Components heatmap

In [16]:
# merge patient information dataframe with predicted probability and component
dd_clustered <- cbind(dd_all, dd_predicted)
dd_clustered <- rownames_to_column(dd_clustered, var = 'patient_key')

# sort by decreseasing component size and increasing assignemnt probability
categories_table <- get_table(dd_clustered[,'predicted_component'], add_total_count = FALSE)
dd_clustered$predicted_component <- factor(dd_clustered$predicted_component, levels = categories_table$values)
dd_clustered <- dd_clustered[order(dd_clustered$predicted_component, dd_clustered$max_proba),]

print_size(dd_clustered)
head(dd_clustered, 3)
Size of dd_clustered: 3300 x 172
patient_keyARID1AARID2ASXL1ASXL2ATRXBCORBCORL1BRAFBRCC3CALRCBLCDKN1BCDKN2ACEBPACREBBPCSF1RCSF3RCSNK1A1CTCFCUX1DDX23DDX4DDX41DDX54DHX33DICER1DNMT3ADNMT3BEEDEGFREP300ETNK1ETV6EZH2FAM175AFLT3GATA1GATA2GNASGNB1HIPK2IDH1IDH2_140IDH2_172IRF1JAK2JARID2KDM5CKDM6AKITKMT2AKMT2CKMT2DKRASLUC7L2MGAMPLMYCNF1NFE2NIPBLNOTCH1NOTCH2NPM1NRASPAPD5PHF6PHIPPIK3CAPPM1DPRPF40BPRPF8PTPN11PTPRFRAD21RAD50RASGRF1RB1ROBO1ROBO2RRASRUNX1SETBP1SETD2SF3B1SH2B3SMC1ASMC3SMG1SPRED2SRCAPSRSF2STAG2STAT3SUZ12TERTTET2TP53U2AF1_157U2AF1_34U2AF2WT1YLPM1ZBTB33ZMYM3ZNF318ZRSR2del5qplus8del7delYdel20qmardel7qdel12pdel11qdel18del17pdel5plus21del17del13del3pplus1qdel16del20plus11qplus1pdel16qdel12del1pplus19del13qdel21del9qplus1plus22plus17qdel4qplus11plus14del3del3qplus17pr_3_3del11pdel14del15delXplus9r_9_9plus13plus15r_1_7del22ringWGAcomponent_0component_1component_2component_3component_4component_5component_6component_7component_8component_9component_10component_11predicted_componentmax_proba
1007E-H-110762-T1-1-D1-10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.01916667 0.1994444 0.07333333 0.11666667 0.14055556 0.16805556 0.060555556 0.05444444 0.0075000000 0.0319444444 0.09805556 0.03027778 1 0.1994444
941E-H-102643-T1-1-D1-10 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.01185185 0.2118519 0.19814815 0.17888889 0.07888889 0.03777778 0.135185185 0.13148148 0.0007407407 0.0000000000 0.01518519 0.00000000 1 0.2118519
936E-H-102628-T1-1-D1-10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.03634921 0.2134921 0.04238095 0.06222222 0.20238095 0.10476190 0.007460317 0.15269841 0.0287301587 0.0004761905 0.05936508 0.08968254 1 0.2134921
In [17]:
# compute column separators position
col_sep <- c(cumsum(categories_table$count))
col_sep <- col_sep

# compute row separators position
row_sep <- c(58, 6, 15, 8, 11, 7, 2, 50)
row_sep_cum <- c(cumsum(row_sep))
row_sep_cum <- row_sep_cum[-length(row_sep_cum)]

# compute row colors vector
sep_color <- rev(c(rgb(180/255, 142/255, 84/255),
                   rgb(146/255, 187/255, 105/255),
                   rgb(233/255, 72/255 , 157/255),
                   rgb(72/255 , 169/255, 137/255),
                   rgb(226/255, 146/255, 102/255),
                   rgb(140/255, 137/255, 190/255),
                   rgb(238/255, 198/255, 116/255),
                   rgb(0/255, 0/255, 0/255)))
row_colors <- c()
for (i in 1:length(row_sep))
    row_colors <- c(row_colors, rep(sep_color[i], row_sep[i]))
In [18]:
# manual gene grouping
col_order <- c(
'ARID1A', 'ARID2', 'BCORL1', 'BRCC3', 'CALR', 'CDKN1B', 'CDKN2A', 'CEBPA', 'CREBBP', 'CSF1R', 'CSF3R', 'CSNK1A1', 'DDX23', 'DDX4', 'DDX41',
'DDX54', 'DHX33', 'DICER1', 'DNMT3B', 'EED', 'EGFR', 'ETNK1', 'FAM175A', 'HIPK2', 'JAK2', 'JARID2', 'KDM5C', 'KMT2C', 'KMT2D', 'LUC7L2',
'MGA', 'MPL', 'NFE2', 'NIPBL', 'NOTCH2', 'NPM1', 'PAPD5', 'PHIP', 'PIK3CA', 'PRPF40B', 'PTPRF', 'RAD50', 'RASGRF1', 'RB1', 'ROBO1', 'ROBO2',
'RRAS', 'SETD2', 'SMG1', 'SPRED2', 'SRCAP', 'STAT3', 'SUZ12', 'TERT', 'YLPM1', 'ZBTB33', 'ZMYM3', 'ZNF318',

'ASXL1', 'BCOR', 'EZH2', 'RAD21', 'STAG2', 'ASXL2',

'ATRX', 'CUX1', 'GNAS', 'IRF1', 'KDM6A', 'KMT2A', 'NOTCH1', 'PHF6', 'SETBP1', 'WT1', 'SH2B3', 'SMC1A', 'EP300', 'SMC3', 'GNB1',

'BRAF', 'CBL', 'FLT3', 'KIT', 'KRAS', 'NF1', 'NRAS', 'PTPN11',

'DNMT3A', 'IDH1', 'IDH2_140', 'IDH2_172', 'TET2', 'ETV6', 'GATA1', 'GATA2', 'RUNX1', 'CTCF', 'MYC',

'SF3B1', 'SRSF2', 'U2AF1_157', 'U2AF1_34', 'U2AF2', 'ZRSR2', 'PRPF8',

'PPM1D', 'TP53',

'del1p', 'del3', 'del3q', 'del4q', 'del5', 'del5q', 'del7', 'del7q', 'del9q', 'del11p', 'del11q', 'del12', 'del12p', 'del13', 'del13q',
'del3p', 'del14', 'del15', 'del16', 'del16q', 'del17', 'del17p', 'del18', 'del20', 'del20q', 'del21', 'del22', 'delX', 'delY',
'plus1', 'plus1p', 'plus1q', 'plus8', 'plus9', 'plus11', 'plus11q', 'plus13', 'plus14', 'plus15', 'plus17p', 'plus17q', 'plus19', 'plus21', 'plus22',
'ring', 'mar', 'WGA', 'r_3_3', 'r_9_9', 'r_1_7'
)
length(col_order) # check that we have all 157 categories
157
In [19]:
# prepare and sort long datframe
long_dd_clustered <- melt(dd_clustered, id = c('patient_key', sprintf('component_%d', 0:11), 'predicted_component', 'max_proba'))
long_dd_clustered$variable <- factor(long_dd_clustered$variable, level = col_order)
long_dd_clustered$predicted_component <- factor(long_dd_clustered$predicted_component, level = categories_table$values)
long_dd_clustered <- long_dd_clustered[order(long_dd_clustered$predicted_component),]
long_dd_clustered$patient_key <- factor(long_dd_clustered$patient_key, levels = unique(long_dd_clustered$patient_key))

# set a high-definition resolution for this plot (DPI)
options(repr.plot.res = 200)
set_notebook_plot_size(10, 10)

# plot the heatmap
ggplot(long_dd_clustered, aes(patient_key, variable)) +

    # geom raster heatmap
    geom_raster(aes(fill = factor(value)), show.legend = FALSE) +
    scale_fill_manual(values = c('0' = 'lightgrey', '1' = 'steelblue')) +

    # column separation and row gene grouping (+ 0.5 to be exactly at the good location)
    geom_vline(xintercept = col_sep[-length(col_sep)] + 0.5, col = 'white', linetype = 3, size = 0.3) +  # [-length(col_sep)] to remove last separator
    geom_hline(yintercept = row_sep_cum + 0.5, col = 'white', linetype = 3, size = 0.3) +

    # 3d effect by adding white and grey horizontal line
    geom_hline(yintercept = seq(1, 157, 2) + 0.5, col = 'white', linetype = 1, size = 0.03) +
    geom_hline(yintercept = seq(0, 157, 2) + 0.5, col = 'darkgrey', linetype = 1, size = 0.04) +
    
    # legend parameters
    theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.y = element_blank(), axis.text.y = element_blank(), plot.margin = unit(c(1, 1, 0.2, 1), 'cm')) + # top right bottom left
    labs(x = 'patients') +
    coord_cartesian(xlim = c(0, 3300), ylim = c(1.5, 156.5), clip = 'off') +

    # colored gene label
    annotate('text', x = rep(-15, 157) , y = 1:157, label = col_order, color = row_colors, size = 1.5, hjust = 1) +
    annotate('text', x = rep(3315, 157), y = 1:157, label = col_order, color = row_colors, size = 1.5, hjust = 0) +

    # component label
    annotate('text', x = col_sep - (categories_table$count / 2), y = c(rep(159, 11), 161), label = categories_table$values, size = 3, hjust = 0.5, vjust = 0.5)

Categories distribution by component

In [20]:
categories_table
valuescountfreq
1 665 20.2%
4 654 19.8%
3 463 14%
2 355 10.8%
6 287 8.7%
5 243 7.4%
7 205 6.2%
NaN 164 5%
8 151 4.6%
9 75 2.3%
10 20 0.6%
11 18 0.5%
In [21]:
# create a dataframe showing the count by component for each category

categories_repartition <- data.frame(category = colnames(dd_all))

for (i in 1:n_components)
    categories_repartition[sprintf('component_%d', i)] <- apply(categories_repartition, 1, function(s) sum(dd_clustered[dd_clustered$predicted_component == i, s['category']]))
                   
# sort categories_repartition by most frequent genes
categories_repartition['total_count'] <- rowSums(categories_repartition[,-1])
categories_repartition <- categories_repartition[order(categories_repartition$total_count, decreasing = TRUE),]
head(categories_repartition)
categorycomponent_1component_2component_3component_4component_5component_6component_7component_8component_9component_10component_11total_count
97TET2 412 32 149 234 38 43 83 21 11 5 0 1028
3ASXL1 169 18 298 72 125 31 164 22 1 2 3 905
85SF3B1 101 16 42 410 23 114 10 15 5 0 0 736
92SRSF2 202 8 274 21 48 2 4 1 4 3 2 569
27DNMT3A 35 44 37 213 29 73 8 49 36 3 3 530
108del5q 21 217 12 11 13 166 2 3 7 9 0 461
In [22]:
# plot category distribution colored by component count

# transform (extand) categories_repartition in a long data frame by gathering the components columns in one column
long_categories_repartition <- melt(categories_repartition, id = c('category', 'total_count'))
# sort long_gene_repartition by most frequent genes
long_categories_repartition$category <- factor(long_categories_repartition$category, levels = categories_repartition$category)
long_categories_repartition <- long_categories_repartition[order(long_categories_repartition$category),]

set_notebook_plot_size(25, 7)
# count plot
ggplot(long_categories_repartition) +
    geom_bar(aes(category, value, fill = variable), stat = 'identity') +
    tilt_x_label(70) +
    scale_fill_brewer(palette = 'Spectral', na.value = 'grey')

# proportion plot
ggplot(long_categories_repartition) +
    geom_bar(aes(category, value, fill = variable), stat = 'identity', position = 'fill') +
    tilt_x_label(70) +
    scale_fill_brewer(palette = 'Spectral', na.value = 'grey')
In [23]:
# plot category distribution by component

# sort long_gene_repartition by heatmap gene order
long_categories_repartition$category <- factor(long_categories_repartition$category, levels = col_order)
long_categories_repartition <- long_categories_repartition[order(long_categories_repartition$category),]

# plot category distribution by component
set_notebook_plot_size(25, 40)
ggplot(long_categories_repartition) + geom_bar(aes(category, value, fill = variable), stat='identity', alpha = 0.8) + tilt_x_label(70) + facet_wrap(~ variable, ncol = 1, scales = 'free')
In [24]:
# save dd_clustered
write.table(dd_clustered, file = 'data/dd_clustered.tsv', sep = '\t')
In [25]:
dd_clustered <- read.table("data/dd_clustered.tsv", sep = "\t", stringsAsFactors = FALSE, header = TRUE)
In [26]:
dd_clustered$predicted_component <- factor(dd_clustered$predicted_component)

Comutation plots

From Elsa:

Clinical correlations

In [27]:
# get clinical data
dd_clinical <- read.table('data/df_clinical_selected.tsv', sep = '\t', stringsAsFactors = FALSE, header = TRUE)

# merge with clustering data (that already contains mutation and cytogenetics data)
dd_clinical <- merge(dd_clustered, dd_clinical, by.x = 'patient_key', by.y = 'LEUKID')

print_size(dd_clinical)
head(dd_clinical, 2)
Size of dd_clinical: 3300 x 253
patient_keyARID1AARID2ASXL1ASXL2ATRXBCORBCORL1BRAFBRCC3CALRCBLCDKN1BCDKN2ACEBPACREBBPCSF1RCSF3RCSNK1A1CTCFCUX1DDX23DDX4DDX41DDX54DHX33DICER1DNMT3ADNMT3BEEDEGFREP300ETNK1ETV6EZH2FAM175AFLT3GATA1GATA2GNASGNB1HIPK2IDH1IDH2_140IDH2_172IRF1JAK2JARID2KDM5CKDM6AKITKMT2AKMT2CKMT2DKRASLUC7L2MGAMPLMYCNF1NFE2NIPBLNOTCH1NOTCH2NPM1NRASPAPD5PHF6PHIPPIK3CAPPM1DPRPF40BPRPF8PTPN11PTPRFRAD21RAD50RASGRF1RB1ROBO1ROBO2RRASRUNX1SETBP1SETD2SF3B1SH2B3SMC1ASMC3SMG1SPRED2SRCAPSRSF2STAG2STAT3SUZ12TERTTET2TP53U2AF1_157plus15r_1_7del22ringWGAcomponent_0component_1component_2component_3component_4component_5component_6component_7component_8component_9component_10component_11predicted_componentmax_probaUNIQUE_PATIENT_IDUNIQUE_SAMPLE_IDSEXMDS_TYPEDIAG_IPSSDIAG_IPSS_RAODSAMPLE_TYPEDIAG_RINGED_SIDEROBLASTSSAMPLE_IPSSSAMPLE_IPSS_RAGE_AT_SAMPLE_TIMESAMPLE_RINGED_SIDEROBLASTSDMTTYPE_OF_DISEASE_MODIFYING_TREATMENTTRANSPLANTTRANSPLANT_STAGETRANSPLANT_TYPEPRIOR_CHEMOTHERAPYPATIENT_RECEIVED_MORE_THAN_1_DMTPATIENT_RECEIVED_MORE_THAN_1_TPLDIAG_RINGED_SIDEROBLASTS_BINARY15CENTERCENTER_COHORTBATCHFLAG_KEEP_REASONDATE_OF_DIAGNOSIS_CORRECTDATE_OF_SAMPLE_CORRECTFLAG_ISSUE_DATEAMLSEX_INFER_SEQUENCINGFLAG_ISSUE_SEXDIAG_PB_BLAST_RANGESAMPLE_PB_BLAST_RANGEDIAG_RINGED_SIDEROBLASTS_RANGEDIAG_BM_BLAST_RANGESAMPLE_RINGED_SIDEROBLASTS_RANGESAMPLE_BM_BLAST_RANGEDIAG_RINGED_SIDEROBLASTS_BINARYSAMPLE_RINGED_SIDEROBLASTS_BINARYSAMPLE_RINGED_SIDEROBLASTS_BINARY15QC_DETAILEDSELECTED_SAMPLEtime_diag_sample_daystime_diag_sample_rangetype_of_dmtDMT_STRICTis_sample_naivedmt_stageWHO_2008FABPB_BLASTLDHFERRITINWBCANCHBPLTMONOCYTESBM_BLASTCYTOGENETICSNUMBER_OF_METAPHASESIPSSIPSS_RRINGED_SIDEROBLASTSRINGED_SIDEROBLASTS_BINARYRINGED_SIDEROBLASTS_BINARY15WHO_2008_SIMPLIFYtime_diag_dmt_strict_daystime_diag_transplant_daysis_treateddmt_activelenalidomidhmatransplantos_diag_yearsos_statusamltime_diag_aml_daysaml_statusaml_diag_years
E-H-100000-T1-1-D1-1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0.002222222 0.1820833 0.003333333 0.01875000 0.22791667 0.01763889 0.5155556 0.01625000 0.012638889 0.003055556 0.0005555556 0.0000000000 6 0.5155556 ihbt_1 276 F primary low good 57 BM granulocytes NA NA NA 57 NA yes LEN no NA NA NA NA NA NA IHBT IHBT_1 B pass 9-mar-04 9-mar-04 NA no F NA <5% NA NA <5% NA NA NA NA NA pass yes 0 < 1 month len yes naive after-sample del(5q) RA 0 7.3 226 7.50 4.65 8.8 406 NA 3 46,xx,del(5)(q15q33.3)[3]/46,xx[1] 22 low good NA NA NA 5q- 2432 NA treated yes 1 0 0 5.819178 0 no NA 0 5.819178
E-H-100001-T1-1-D1-1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.012592593 0.1511111 0.178518519 0.04259259 0.04777778 0.01111111 0.4777778 0.03074074 0.007037037 0.000000000 0.0403703704 0.0003703704 6 0.4777778 ihbt_2 541 F primary int1 good 56 BM granulocytes NA NA NA 56 NA no no NA NA NA NA NA NA IHBT IHBT_1 B pass 23-feb-07 23-feb-07 NA no F NA <5% NA NA <5% NA NA NA NA NA pass yes 0 < 1 month no naive no-treatment RA RA 0 5.0 103 7.21 2.88 9.8 364 NA 2 46,xx,del(5)(q13q33)[6]/47,xx,+8[1]/46,xx[15] 22 int1 good NA NA NA RCUD NA NA not-treated no 0 0 0 2.857534 0 no NA 0 2.857534

Continuous features

In [28]:
get_colors <- function(n, add_color = c()) {
    # Return a vector of n divergent and nice-looking colors
    # → Ex : get_colors(3, add_color = c('grey', 'blue')) ⟹ c('#9E0142', '#FEE08B', '#88CFA4', 'grey', 'blue')
    # → Arguments
    #     - n        : number of colors
    #     - add_color: if specified, add the given color vector at the end of the generated color vector
    
    colors <- c()
    
    if (n >= 5)
        colors <- colorRampPalette(brewer.pal(11, "Spectral"))(n)
    else
        colors <- c('#9E0142', '#FEE08B', '#88CFA4', '#5E4FA2')[1:n]
    
    return (c(colors, add_color))
}

get_colors(3, add_color = c('grey', 'blue'))
  1. '#9E0142'
  2. '#FEE08B'
  3. '#88CFA4'
  4. 'grey'
  5. 'blue'
In [29]:
plot_continuous_feature_by_categorical_feature <- function(data, continuous_feature_name, categorical_feature_name, scale_y_sqrt, legend_prefix = '') {
    # Plot the distribution of a continuous feature across a categorical feature, produce boxplot, violin plot and density plots side by side
    # → Arguments
    #     - data                    : dataframe with both the continous feature and the categorical feature as columns, the categorical feature MUST BE a factor
    #     - continuous_feature_name : name of the continuous feature to plot
    #     - categorical_feature_name: name of the categorical feature to plot
    #     - scale_y_sqrt            : wheter to plot with a sqrt y scale or not
    #     - legend_prefix           : add a prefix to a legend
    
    # helper function to transform a category name (ex: cat_1) to its name and its count (ex: cat_1 (n = 42))
    get_legend <- function(legend_prefix, x, categorical_count_table) {
        return (sprintf('%s%s (n = %d)', legend_prefix, x, categorical_count_table[categorical_count_table$values == x,]$count))
    }
    
    # create new categorical feature by adding (n = count) for each category, ordered as the original factor categorical feature
    categorical_count_table <- get_table(data[,categorical_feature_name], add_total_count = FALSE)
    data$categorical <- sapply(data[,categorical_feature_name], function(x) get_legend(legend_prefix, x, categorical_count_table))
    # order
    data$categorical <- factor(data$categorical, levels = sapply(levels(data[,categorical_feature_name]), function(x) get_legend(legend_prefix, x, categorical_count_table)))

    # remove NA values
    n_initial <- nrow(data)
    data <- data[! is.na(data[continuous_feature_name]),]


    set_notebook_plot_size(25, 7)
    colors <- get_colors(nrow(categorical_count_table))
    
    # boxplot
    p1 <- ggplot(data) +
              geom_boxplot(aes_string('categorical', continuous_feature_name, fill = 'categorical'), alpha = 0.7, notch = TRUE) +
              scale_fill_manual(values = colors) +
              ggtitle(sprintf('%s density by %s\n(removed %d NA values)', continuous_feature_name, categorical_feature_name, n_initial - nrow(data))) +
              theme(plot.title = element_text(size = 18, face = "bold")) +
              scale_x_discrete(labels = levels(data[,categorical_feature_name]), breaks = waiver())
    
    # violin plot
    p2 <- ggplot(data) + geom_violin(aes_string('categorical', continuous_feature_name, fill = 'categorical'), alpha = 0.7) +
              scale_fill_manual(values = colors) +
              ggtitle(' \n ') +
              theme(plot.title = element_text(size = 18, face = "bold")) +
              scale_x_discrete(labels = levels(data[,categorical_feature_name]), breaks = waiver())
    
    # density plot
    p3 <- ggplot(data) + geom_density(aes_string(continuous_feature_name, fill = 'categorical'), alpha = 0.7) +
              facet_wrap(~ categorical, ncol = 4) +
              scale_fill_manual(values = colors) +
              ggtitle(' \n ') +
              theme(plot.title = element_text(size = 18, face = "bold"))
                                       
    if (scale_y_sqrt) {
        p1 <- p1 + scale_y_sqrt()
        p2 <- p2 + scale_y_sqrt()
    }
    
    # plot the three plots side by side and remove ggplot warnings
    suppressWarnings(suppressMessages(grid.arrange(p1, p2, p3, ncol = 3)))
}

# plot_continuous_feature_by_categorical_feature(dd_clinical, 'HB', 'predicted_component', TRUE, 'comp_')
In [30]:
# plot the distribution of multiple continuous features accross components
for (feature_name in c('HB', 'PLT', 'ANC', 'WBC', 'MONOCYTES', 'PB_BLAST', 'BM_BLAST', 'RINGED_SIDEROBLASTS'))
    plot_continuous_feature_by_categorical_feature(dd_clinical, feature_name, 'predicted_component', TRUE)
In [31]:
plot_continuous_feature_by_categorical_feature(dd_clinical, 'AGE_AT_SAMPLE_TIME', 'predicted_component', FALSE)

Categorical features

In [32]:
plot_categorical_feature_by_categorical_feature <- function(data, feature_name_1, feature_name_2) {
    # Plot the distribution of a categorical feature across another categorical feature
    # → Arguments
    #     - data          : dataframe with both the two categorical features
    #     - feature_name_1: first categorical feature
    #     - feature_name_2: second categorical feature
    
    # remove NA values
    n_initial <- nrow(data)
    data <- data[!is.na(data[feature_name_1]),]
    
    set_notebook_plot_size(25, 5)
    colors <- get_colors(length(unique(data[,feature_name_2])))
    
    # categorical feature 1 distribution
    p1 <- ggplot(data) +
              geom_bar(aes_string(feature_name_2, fill = feature_name_1), position = 'fill') +
              ggtitle(sprintf('%s count and proportion by %s\n(removed %d NA values)', feature_name_1, feature_name_2, n_initial - nrow(data))) +
              theme(plot.title = element_text(size = 18, face = "bold"))
    
    # categorical feature 2 distribution
    p2 <- ggplot(data) +
              geom_bar(aes_string(feature_name_1, fill = feature_name_2), position = 'fill') +
              scale_fill_manual(values = colors) +
              ggtitle(' \n ') +
              theme(plot.title = element_text(size = 18, face = "bold"))
    
    # categorical feature count by component
    p3 <- ggplot(data) +
              geom_bar(aes_string(feature_name_1, fill = feature_name_2), position = 'dodge') +
              scale_fill_manual(values = colors) +
              ggtitle(' \n ') + 
              theme(plot.title = element_text(size = 18, face = "bold")) 
    
    # plot the three plots side by side and remove ggplot warnings
    suppressWarnings(suppressMessages(grid.arrange(p1, p2, p3, ncol = 3)))
}

# plot_categorical_feature_by_categorical_feature(dd_clinical, 'AML', 'predicted_component')
In [33]:
# plot the distribution of multiple categorical features accross components
for (feature_name in c('WHO_2008_SIMPLIFY', 'SEX'))
    plot_categorical_feature_by_categorical_feature(dd_clinical, feature_name, 'predicted_component')

By country analysis

In [34]:
# get countries data
dd_countries <- read.table('data/CentersBatches.csv', sep = ',', stringsAsFactors = FALSE, header = TRUE)

# merge with clinical data (that already contains mutation and cytogenetics data as well as clustering data)
dd_clinical <- merge(dd_clinical, unique(dd_countries[, c('center', 'country')]), by.x = 'CENTER', by.y = 'center')

print_size(dd_clinical)
head(dd_clinical, 2)
Size of dd_clinical: 3300 x 254
CENTERpatient_keyARID1AARID2ASXL1ASXL2ATRXBCORBCORL1BRAFBRCC3CALRCBLCDKN1BCDKN2ACEBPACREBBPCSF1RCSF3RCSNK1A1CTCFCUX1DDX23DDX4DDX41DDX54DHX33DICER1DNMT3ADNMT3BEEDEGFREP300ETNK1ETV6EZH2FAM175AFLT3GATA1GATA2GNASGNB1HIPK2IDH1IDH2_140IDH2_172IRF1JAK2JARID2KDM5CKDM6AKITKMT2AKMT2CKMT2DKRASLUC7L2MGAMPLMYCNF1NFE2NIPBLNOTCH1NOTCH2NPM1NRASPAPD5PHF6PHIPPIK3CAPPM1DPRPF40BPRPF8PTPN11PTPRFRAD21RAD50RASGRF1RB1ROBO1ROBO2RRASRUNX1SETBP1SETD2SF3B1SH2B3SMC1ASMC3SMG1SPRED2SRCAPSRSF2STAG2STAT3SUZ12TERTTET2TP53plus15r_1_7del22ringWGAcomponent_0component_1component_2component_3component_4component_5component_6component_7component_8component_9component_10component_11predicted_componentmax_probaUNIQUE_PATIENT_IDUNIQUE_SAMPLE_IDSEXMDS_TYPEDIAG_IPSSDIAG_IPSS_RAODSAMPLE_TYPEDIAG_RINGED_SIDEROBLASTSSAMPLE_IPSSSAMPLE_IPSS_RAGE_AT_SAMPLE_TIMESAMPLE_RINGED_SIDEROBLASTSDMTTYPE_OF_DISEASE_MODIFYING_TREATMENTTRANSPLANTTRANSPLANT_STAGETRANSPLANT_TYPEPRIOR_CHEMOTHERAPYPATIENT_RECEIVED_MORE_THAN_1_DMTPATIENT_RECEIVED_MORE_THAN_1_TPLDIAG_RINGED_SIDEROBLASTS_BINARY15CENTER_COHORTBATCHFLAG_KEEP_REASONDATE_OF_DIAGNOSIS_CORRECTDATE_OF_SAMPLE_CORRECTFLAG_ISSUE_DATEAMLSEX_INFER_SEQUENCINGFLAG_ISSUE_SEXDIAG_PB_BLAST_RANGESAMPLE_PB_BLAST_RANGEDIAG_RINGED_SIDEROBLASTS_RANGEDIAG_BM_BLAST_RANGESAMPLE_RINGED_SIDEROBLASTS_RANGESAMPLE_BM_BLAST_RANGEDIAG_RINGED_SIDEROBLASTS_BINARYSAMPLE_RINGED_SIDEROBLASTS_BINARYSAMPLE_RINGED_SIDEROBLASTS_BINARY15QC_DETAILEDSELECTED_SAMPLEtime_diag_sample_daystime_diag_sample_rangetype_of_dmtDMT_STRICTis_sample_naivedmt_stageWHO_2008FABPB_BLASTLDHFERRITINWBCANCHBPLTMONOCYTESBM_BLASTCYTOGENETICSNUMBER_OF_METAPHASESIPSSIPSS_RRINGED_SIDEROBLASTSRINGED_SIDEROBLASTS_BINARYRINGED_SIDEROBLASTS_BINARY15WHO_2008_SIMPLIFYtime_diag_dmt_strict_daystime_diag_transplant_daysis_treateddmt_activelenalidomidhmatransplantos_diag_yearsos_statusamltime_diag_aml_daysaml_statusaml_diag_yearscountry
CCH E-H-106024-T1-1-D1-1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.07111111 0.13611111 0.000000000 0.00000000 0.38166667 0.2088889 0.003333333 0.1977778 0.0005555556 0.0000 0.000000000 0.0005555556 4 0.3816667 129 129 M primary low very_good 70 BM_MNC NA NA NA 70 NA no NA no NA NA NA NA NA NA CCH_1 D pass 7-apr-10 1-jun-10 NA no M NA <5% <5% NA <5% NA <5% NA NA NA pass yes 55 1 to 6 months NA no naive no-treatment RCMD RA 0 NA NA 8.5 6.5 12.4 141 NA 2 46,xy[29]/45,x,-y[3] 32 low very_good NA NA NA RCMD NA NA not-treated no 0 0 0 3.350685 0 no NA 0 3.350685 france
CCH E-H-106025-T1-1-D1-1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0.02277778 0.03972222 0.002638889 0.09958333 0.03013889 0.1445833 0.009583333 0.6425000 0.0044444444 0.0025 0.001527778 0.0000000000 7 0.6425000 130 130 M primary int1 int 75 BM_MNC NA NA NA 75 NA no NA no NA NA NA NA NA NA CCH_1 D pass 26-jun-10 12-jul-10 NA no M NA <5% <5% NA <5% NA <5% NA NA NA pass yes 16 < 1 month NA no naive no-treatment RAEB-1 RAEB 0 NA NA 6.8 4.9 11.0 48 NA 5 47,xy,+8[10]/46,xy[10] 20 int1 int NA NA NA RAEB NA NA not-treated no 0 0 0 2.794521 1 no NA 0 2.794521 france
In [35]:
get_table(dd_clinical$country)
valuescountfreq
sweden 889 26.9%
germany 637 19.3%
italy 572 17.3%
spain 317 9.6%
netherland 198 6%
france 159 4.8%
brazil 119 3.6%
taiwan 107 3.2%
austria 83 2.5%
united_states 70 2.1%
greece 66 2%
england 50 1.5%
czech_republica 33 1%
-- total -- 3300 100%
In [36]:
# gather european countries
dd_clinical$region <- dd_clinical$country
dd_clinical$region[dd_clinical$country %in% c('austria', 'czech_republica', 'england', 'france', 'germany', 'greece', 'italy', 'netherland', 'spain', 'sweden')] <- 'europe'
get_table(dd_clinical$region)
valuescountfreq
europe 3004 91%
brazil 119 3.6%
taiwan 107 3.2%
united_states 70 2.1%
-- total -- 3300 100%
In [37]:
plot_categorical_feature_by_categorical_feature(dd_clinical, 'region', 'predicted_component')

Survival plots

In [38]:
plot_survival_curve <- function(data, stratifier_column_name, colors, line_size = 1.5, legend_size = 15, legend_interspace = 1.3, surv_diag_years = 'os_diag_years', surv_status = 'os_status', cum_event = FALSE) {
    # Plot the survival curve of the given data on the given stratifier
    # → Arguments
    #     - data
    #     - stratifier_column_name: name of the column on which to stratify the survival plot
    #     - colors                : vector of colors of the same size as the number of stratification
    #     - line_size
    #     - legend_size
    #     - legend_interspace     : y space between legend lines
    #     - surv_diag_years       : formula variable 1
    #     - surv_status           : formula variable 2
    #     - cum_event             : set to true for cumulative event curve rather than survival curve

    if (cum_event) {
        fun <- 'event'
        ylab <- 'cumulative incidence'
    }
    else {
        fun <- NULL
        ylab <- 'survival probability'
    }
    
    # evaluate the survival curve
    formula <- sprintf('Surv(%s, %s) ~ %s', surv_diag_years, surv_status, stratifier_column_name)
    surv_fit <- surv_fit(as.formula(formula), data = data)
    
    # set legend labels
    short_names <- sapply(names(surv_fit$strata), function (s) strsplit(s, '=')[[1]][[2]])
    legend_labels <- sprintf('%s (n = %s)', short_names, surv_fit$strata)
    
    # create plot object
    ggsurv <- ggsurvplot(surv_fit,
                         fun = fun,
                         censor = FALSE,
                         palette = colors,
                         linetype = 'solid',
                         size = line_size,
                         xlab = 'years',
                         ylab = ylab,
                         legend = c(0.6, 0.7),
                         legend.title = '',
                         legend.labs = legend_labels)
                          
    # modify plot object legend properties
    ggsurv$plot <- ggsurv$plot + theme(legend.text = element_text(size = legend_size), legend.key.size = unit(legend_interspace, 'lines'))

    # return ggplot object
    return (ggsurv$plot)
}

set_notebook_plot_size(8, 4)
plot_survival_curve(dd_clinical, 'predicted_component', get_colors(11, 'grey'), line_size = 0.8, legend_size = 10, legend_interspace = 0.8)
In [39]:
plot_survival_curve_stratified_by_categorical_feature <- function(data, mask, wildtype_mask, stratifier_column_name, legend_prefix, colors, remove_na = FALSE, wildtype_name = 'wildtype', AML = FALSE) {
    # Plot the survival curve of the given data on the given stratifier with the given mask and wildtypes
    # → Arguments
    #     - data
    #     - mask                  : mask of the data on which to stratify
    #     - wildtype_mask         : mask for the wildtype
    #     - stratifier_column_name: name of the column on which to stratify the survival plot
    #     - legend_prefix         : prefix for the masked and stratified legend
    #     - colors                : vector of colors of the same size as the number of stratification
    #     - remove_na             : if TRUE remove NA on stratifier
    #     - wildtype_name         : name of the wildtype curve
    #     - AML                   : whether to plot a cumulative incidence plot for AML progression or not
    
    # create stratification
    data$stratify <- wildtype_name
    
    if (stratifier_column_name == '')
        data[mask, 'stratify'] <- legend_prefix
    else
        data[mask, 'stratify'] <- sprintf('%s%s', legend_prefix, data[mask, stratifier_column_name])
    data <- data[mask | wildtype_mask,]

    # remove NA values
    if (stratifier_column_name != '' & remove_na)
        data <- data[wildtype_mask | (mask & ! is.na(data[,stratifier_column_name])),]
    
    # plot
    if (AML)
        return (plot_survival_curve(data, 'stratify', colors, surv_diag_years = 'aml_diag_years', surv_status = 'aml_status', cum_event = TRUE))
    else
        return (plot_survival_curve(data, 'stratify', colors))
}

# set_notebook_plot_size(7, 7)
# plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
#                                                       mask = (dd_clinical$SF3B1 == 1),
#                                                       wildtype_mask = (dd_clinical$SF3B1 == 0),
#                                                       stratifier_column_name = '',
#                                                       legend_prefix = 'has_SF3B1_mutation',
#                                                       colors = c('orange', 'grey'),
#                                                       remove_na = FALSE)
In [40]:
plot_survival_curve_by_gene <- function(dd_clinical, gene_name, components, AML = FALSE) {
    # Plot the survival curve of the given data on the given stratifier with the given gene and components stratifier
    # → Arguments
    #     - dd_clinical
    #     - gene_name
    #     - components: vector of components on which to stratify
    #     - AML       : whether to plot cumulative incidence plots for AML progression or not
    
    # has gene vs wildtype
    p1 <- plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
                                                                mask = (dd_clinical[,gene_name] == 1),
                                                                wildtype_mask = (dd_clinical[,gene_name] == 0),
                                                                stratifier_column_name = '',
                                                                legend_prefix = sprintf('has_%s_mutation', gene_name),
                                                                colors = c('orange', 'grey'),
                                                                remove_na = FALSE,
                                                                AML = AML)
    
    # stratify by given components and a "other" component
    dd_clinical$predicted_component <- as.character(dd_clinical$predicted_component)
    dd_clinical$predicted_component[! dd_clinical$predicted_component %in% components] <- 'other'
    
    # has gene and belongs to component vs wildtype
    p2 <- plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
                                                                mask = (dd_clinical[,gene_name] == 1),
                                                                wildtype_mask = (dd_clinical[,gene_name] == 0),
                                                                stratifier_column_name = 'predicted_component',
                                                                legend_prefix = sprintf('_%s_comp_', gene_name),
                                                                colors = get_colors(length(components) + 1, 'grey'),
                                                                remove_na = FALSE,
                                                                AML = AML)
    
    set_notebook_plot_size(20, 5)
    grid.arrange(p1, p2, ncol = 2)
}

# plot_survival_curve_by_gene(dd_clinical, 'SF3B1', c(4, 6))
In [41]:
# plot survival curves stratified for selected gene and components
genes <- c('SF3B1', 'SRSF2', 'U2AF1_157', 'U2AF1_34', 'ZRSR2', 'del5q', 'TP53')
components <- list(c(4, 6), c(1, 3, 5), c(5, 7), c(8), c(1, 7), c(2, 6), c(2, 6))

for (i in 1:length(genes))
    plot_survival_curve_by_gene(dd_clinical, genes[[i]], components[[i]])

IPSSR

In [42]:
get_table(dd_clinical$IPSS_R)
valuescountfreq
good 924 36.7%
int 530 21%
very_good 394 15.6%
poor 386 15.3%
very_poor 286 11.3%
-- total --3300 100%
In [43]:
# merge some IPSSR categories
dd_clinical$IPSS_R_simplified <- dd_clinical$IPSS_R
dd_clinical$IPSS_R_simplified[dd_clinical$IPSS_R %in% c('very_good', 'good')] <- 'good'
dd_clinical$IPSS_R_simplified[dd_clinical$IPSS_R %in% c('very_poor', 'poor')] <- 'poor'
In [44]:
plot_categorical_feature_by_categorical_feature(dd_clinical, 'IPSS_R_simplified', 'predicted_component')
In [45]:
# plot survival curves stratified for selected IPSS categories and components
plots <- lapply(levels(dd_clinical$predicted_component), function(i) plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
                                                                                                                           mask = (dd_clinical$predicted_component == i),
                                                                                                                           wildtype_mask = (dd_clinical$predicted_component != i),
                                                                                                                           stratifier_column_name = 'IPSS_R_simplified',
                                                                                                                           legend_prefix = sprintf('comp_%s_IPSSR', i),
                                                                                                                           colors = c('#ABDDA4', '#FDAE61', '#D7191C', 'grey'),
                                                                                                                           remove_na = TRUE))

set_notebook_plot_size(25, 25)
grid.arrange(grobs = plots, ncol = 3)
In [46]:
set_notebook_plot_size(5, 4)
plot_survival_curve(dd_clinical, 'IPSS_R_simplified', c('#ABDDA4', '#FDAE61', '#D7191C'))
In [47]:
set_notebook_plot_size(10, 4)
plot_survival_curve(dd_clinical, 'WHO_2008_SIMPLIFY', get_colors(9))
In [48]:
set_notebook_plot_size(7, 4)

mask          <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'good' & dd_clinical$predicted_component %in% c(1, 3, 4, 6))
wildtype_mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'int')

plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
                                                      mask = mask,
                                                      wildtype_mask = wildtype_mask,
                                                      stratifier_column_name = 'predicted_component',
                                                      legend_prefix = '_IPSSR_good_comp',
                                                      colors = get_colors(4, 'grey'),
                                                      wildtype_name = 'IPSSR_int')
In [49]:
set_notebook_plot_size(7, 4)

mask          <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'int' & dd_clinical$predicted_component %in% c(1, 3, 4, 6))
wildtype_mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'poor')

plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
                                                      mask = mask,
                                                      wildtype_mask = wildtype_mask,
                                                      stratifier_column_name = 'predicted_component',
                                                      legend_prefix = '_IPSSR_int_comp',
                                                      colors = get_colors(4, 'grey'),
                                                      wildtype_name = 'IPSSR_poor')
In [50]:
set_notebook_plot_size(7, 4)

mask          <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'poor' & dd_clinical$predicted_component %in% c(2, 3, 4))
wildtype_mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'int')

plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
                                                      mask = mask,
                                                      wildtype_mask = wildtype_mask,
                                                      stratifier_column_name = 'predicted_component',
                                                      legend_prefix = '_IPSSR_poor_comp',
                                                      colors = get_colors(3, 'grey'),
                                                      wildtype_name = 'IPSSR_int')
In [51]:
set_notebook_plot_size(7, 4)

mask          <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'good' & dd_clinical$predicted_component == 3)
wildtype_mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'poor' & dd_clinical$predicted_component == 4)

plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
                                                      mask = mask,
                                                      wildtype_mask = wildtype_mask,
                                                      stratifier_column_name = 'predicted_component',
                                                      legend_prefix = '_IPSSR_good_comp',
                                                      colors = get_colors(2),
                                                      wildtype_name = '_IPSSR_poor_comp_4')
In [52]:
set_notebook_plot_size(7, 4)

mask          <- (! is.na(dd_clinical$WHO_2008_SIMPLIFY) & dd_clinical$WHO_2008_SIMPLIFY == '5q-' & dd_clinical$predicted_component == 6)
wildtype_mask <- (! is.na(dd_clinical$WHO_2008_SIMPLIFY) & dd_clinical$WHO_2008_SIMPLIFY == '5q-' & dd_clinical$predicted_component != 6)

plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
                                                      mask = mask,
                                                      wildtype_mask = wildtype_mask,
                                                      stratifier_column_name = 'predicted_component',
                                                      legend_prefix = '_WHO_5q-_comp_',
                                                      colors = get_colors(2),
                                                      wildtype_name = '_WHO_5q-_other_comp')

VAF distribution

In [53]:
# get maf data
dd_maf <- read.table('data/df_maf_driver_selected.tsv', sep = '\t', stringsAsFactors = FALSE, header = TRUE)

# merge with dd_clustered
dd_maf_clustered <- merge(dd_maf, dd_clustered[,c('patient_key', 'predicted_component')], by.x = 'TARGET_NAME', by.y = 'patient_key')

print_size(dd_maf_clustered)
head(dd_maf_clustered, 2)
Size of dd_maf_clustered: 11808 x 161
TARGET_NAMEID_VARIANTREFERENCE_NAMECHRSTARTENDREFALTCONTEXT_5CONTEXT_3QUALCALLED_BYPASSED_BYNUMBER_OF_CALLERSFLAGS_ALLTARGET_VAF_MEANTARGET_VAF_STDREFERENCE_VAF_MEANREFERENCE_VAF_STDmutect_TARGET_VAFmutect_TARGET_DEPTHmutect_REFERENCE_VAFmutect_REFERENCE_DEPTHmutect_DIRPROPmutect_READS_FORWARDmutect_READS_REVERSEstrelka_TARGET_VAFstrelka_TARGET_DEPTHstrelka_REFERENCE_VAFstrelka_REFERENCE_DEPTHcaveman.pindel_TARGET_VAFcaveman.pindel_TARGET_DEPTHcaveman.pindel_REFERENCE_VAFcaveman.pindel_REFERENCE_DEPTHcaveman.pindel_DIRPROPcaveman.pindel_READS_FORWARDcaveman.pindel_READS_REVERSEVAG_VDVAG_VTVAG_GENEVAG_TRANSCRIPTVAG_RNA_CHANGEVAG_cDNA_CHANGEVAG_PROTEIN_CHANGEVAG_EFFECTPROTEIN_CHANGEEFFECTVAG_AMBIGUOUS_ANNOTVEP_ConsequenceVEP_IMPACTVEP_SYMBOLVEP_Feature_typeVEP_FeatureVEP_BIOTYPEVEP_EXONVEP_INTRONVEP_HGVScVEP_HGVSpVEP_Amino_acidsVEP_CodonsVEP_Existing_variationVEP_DISTANCEVEP_STRANDVEP_FLAGSVEP_VARIANT_CLASSVEP_SYMBOL_SOURCEVEP_HGNC_IDVEP_CANONICALVEP_CCDSVEP_SWISSPROTVEP_TREMBLVEP_UNIPARCVEP_SOURCEVEP_GENE_PHENOVEP_SIFTVEP_PolyPhenVEP_DOMAINSVEP_HGVS_OFFSETVEP_AFVEP_gnomAD_AFVEP_MAX_AFVEP_MAX_AF_POPSVEP_CLIN_SIGVEP_SOMATICVEP_PHENOVEP_MOTIF_NAMEVEP_MOTIF_POSVEP_HIGH_INF_POSVEP_MOTIF_SCORE_CHANGEVEP_gnomAD_genomeVEP_gnomAD_genome_AC.AN_AFRVEP_gnomAD_genome_AF_AFRVEP_gnomAD_genome_AC.AN_AMRVEP_gnomAD_genome_AF_AMRVEP_gnomAD_genome_AC.AN_ASJVEP_gnomAD_genome_AF_ASJVEP_gnomAD_genome_AC.AN_EASVEP_gnomAD_genome_AF_EASVEP_gnomAD_genome_AC.AN_FINVEP_gnomAD_genome_AF_FINVEP_gnomAD_genome_AC.AN_NFEVEP_gnomAD_genome_AF_NFEVEP_gnomAD_genome_AC.AN_OTHVEP_gnomAD_genome_AF_OTHVEP_gnomAD_genome_AC.AN_MaleVEP_gnomAD_genome_AF_MaleVEP_gnomAD_genome_AC.AN_FemaleVEP_gnomAD_genome_AF_FemaleVEP_gnomAD_exomeVEP_gnomAD_exome_AC.AN_AFRVEP_gnomAD_exome_AF_AFRVEP_gnomAD_exome_AC.AN_AMRVEP_gnomAD_exome_AF_AMRVEP_gnomAD_exome_AC.AN_ASJVEP_gnomAD_exome_AF_ASJVEP_gnomAD_exome_AC.AN_EASVEP_gnomAD_exome_AF_EASVEP_gnomAD_exome_AC.AN_FINVEP_gnomAD_exome_AF_FINVEP_gnomAD_exome_AC.AN_NFEVEP_gnomAD_exome_AF_NFEVEP_gnomAD_exome_AC.AN_OTHVEP_gnomAD_exome_AF_OTHVEP_gnomAD_exome_AC.AN_MaleVEP_gnomAD_exome_AF_MaleVEP_gnomAD_exome_AC.AN_FemaleVEP_gnomAD_exome_AF_FemaleCOSMICCENTERvariant.keyNUM_FLAGS_MUTECTNUM_FLAGS_STRELKANUM_FLAGS_CAVEMAN.PINDELNUM_FLAGSsum_normalssum_normals_callermedian_vaf_normalsFINAL_SOMATIC_ANNOTATIONElsa_commentElsa_todoublecheckElsa_todo_check_indelIS_TRUNCATEDIS_COSMIC_EXACTIS_COSMIC_POSCOUNTprotein.keyaa_numberCENSUSCENSUSKBPAN_HOTSPOTHEMEPACT_COUNTELLI_HOTSPOTDRIVERFLAG_DRIVER_CATEGORYElsa_oncogenicVAFREFERENCE_VAFREMOVE_CURATION_MERGEANNOT_PHASEANNOT_PHASE_2predicted_component
E-H-100000-T1-1-D1-1 c06198e2-9200-11e7-b5e7-eb21424a729f I-H-100778-N1-1-D1-1 11 119149248 119149248 G A TTTCT CCGAT NA C,S,M M,S 2 MNP 0.083 0.003 0.001 0 0.087 917 0.001 1319 1.0 40 40 0.079 1308 0.001 1613 0.082 1308 0.001 1612 0.672 64 43 CBL|CCDS8418.1|r.1632g>a|c.1256G>A|p.C419Y|protein_coding:CDS:exon:substitution:codon_variant:non_synonymous_codon|SO:0000010:SO:0000316:SO:0000147:SO:1000002:SO:0001581:SO:0001583 Sub CBL CCDS8418.1 r.1632g>a c.1256G>A p.C419Y non_synonymous_codon p.C419Y non_synonymous_codon 1 missense_variant MODERATE CBL Transcript ENST00000264033 protein_coding 9|16 ENST00000264033.4:c.1256G>A ENSP00000264033.3:p.Cys419Tyr C/Y tGc/tAc COSM5945221 NA 1 SNV HGNC 1541 YES CCDS8418.1 P22681 UPI000013D4A7 NA 1 deleterious(0) probably_damaging(1) Gene3D:3.30.40.10&Pfam_domain:PF13920&PROSITE_profiles:PS50089&hmmpanther:PTHR23007&hmmpanther:PTHR23007:SF5&SMART_domains:SM00184&Superfamily_domains:SSF57850 NA NA NA NA 1 1 NA NA NA NA NA NA NA NA NA NA NA NA NA 11:119149248-119149248 0 | 15304 0 0 | 33582 0 0 | 9850 0 0 | 17248 0 0 | 22300 0 1 | 111614 8.96e-06 0 | 5482 0 1 | 134846 7.42e-06 0 | 111316 0 GENOMIC_EXACT;CBL_p.C419Y;ALL=2;SOM=0;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=2|GENOMIC_CLOSE;CBL_p.C416Y;ALL=3;SOM=0;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=3|GENOMIC_CLOSE;CBL_p.C416F;ALL=1;SOM=1;GENITAL_TRACT=1|GENOMIC_CLOSE;CBL_p.C416W;ALL=2;SOM=0;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=2|GENOMIC_CLOSE;CBL_p.P417A;ALL=2;SOM=0;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=2|GENOMIC_CLOSE;CBL_p.P417S;ALL=2;SOM=0;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=2|GENOMIC_CLOSE;CBL_p.P417H;ALL=2;SOM=2;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=1;SKIN=1|GENOMIC_CLOSE;CBL_p.P417R;ALL=2;SOM=0;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=2|GENOMIC_CLOSE;CBL_p.P417L;ALL=7;SOM=7;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=4;SKIN=3|GENOMIC_CLOSE;CBL_p.F418I;ALL=2;SOM=2;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=1;SKIN=1|GENOMIC_CLOSE;CBL_p.F418S;ALL=6;SOM=0;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=6|GENOMIC_CLOSE;CBL_p.F418L;ALL=3;SOM=3;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=2;SKIN=1|GENOMIC_CLOSE;CBL_p.C419R;ALL=2;SOM=2;GENITAL_TRACT=1;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=1|GENOMIC_CLOSE;CBL_p.R420G;ALL=2;SOM=2;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=1;SKIN=1|GENOMIC_CLOSE;CBL_p.R420*;ALL=2;SOM=0;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=1;ENDOMETRIUM=1|GENOMIC_CLOSE;CBL_p.R420Q;ALL=17;SOM=17;CENTRAL_NERVOUS_SYSTEM=1;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=16|GENOMIC_CLOSE;CBL_p.R420P;ALL=2;SOM=2;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=2|GENOMIC_CLOSE;CBL_p.R420L;ALL=1;SOM=0;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=1|GENOMIC_CLOSE;CBL_p.I423_E427delIKGTE;ALL=1;SOM=1;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=1|GENOMIC_CLOSE;CBL_p.C421W;ALL=1;SOM=1;LARGE_INTESTINE=1IHBT 11_119149248_G_A 0 0 0 0 0 0 0 OK NA NA not-truncated 1 0 1 CBL_C419 419 oncogene/TSG TSG 0 0 1 driver recurrence_cosmic 0.083 0.001 6
E-H-100000-T1-1-D1-1 5_131822301_G_T I-H-100778-N1-1-D1-1 5 131822301 131822301 G T ACTGT TAGCT NA S,M M,S 2 PASS 0.022 0.002 0.002 0 0.024 374 0.002 1145 0.8 5 4 0.021 532 0.001 1376 NA NA NA NA NA NA NA IRF1|CCDS4155.1|r.751c>a|c.492C>A|p.Y164*|protein_coding:CDS:exon:substitution:codon_variant:stop_gained|SO:0000010:SO:0000316:SO:0000147:SO:1000002:SO:0001581:SO:0001587 Sub IRF1 CCDS4155.1 r.751c>a c.492C>A p.Y164* stop_gained p.Y164* stop_gained 1 stop_gained HIGH IRF1 Transcript ENST00000245414 protein_coding 6|10 ENST00000245414.4:c.492C>A ENSP00000245414.4:p.Tyr164Ter Y/* taC/taA NA -1 SNV HGNC 6116 YES CCDS4155.1 P10914 R4GNI0&Q75MZ8&Q6FHN8&C9JD95 UPI000012D885 NA 1 PIRSF_domain:PIRSF038196&hmmpanther:PTHR11949&hmmpanther:PTHR11949:SF3 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA GENOMIC_CLOSE;IRF1_p.P167fs*42;ALL=1;SOM=1;HAEMATOPOIETIC_AND_LYMPHOID_TISSUE=1|GENOMIC_CLOSE;IRF1_p.Y164C;ALL=1;SOM=1;BONE=1 IHBT 5_131822301_G_T 0 0 0 0 0 0 0 OK NA NA truncated 0 0 1 IRF1_Y164 164 not_known not_known 0 0 0 driver NA possible 0.022 0.002 6
In [54]:
plot_continuous_feature_by_categorical_feature(dd_maf_clustered, 'TARGET_VAF_MEAN', 'predicted_component', FALSE)
In [55]:
plot_continuous_feature_stratified_by_component_and_gene <- function(data, gene, components) {
    data <- data[data$VAG_GENE == gene,]
    data$category <- sprintf('%s_other', gene)
    mask <- data$VAG_GENE == gene & data$predicted_component %in% components
    data$category[mask] <- sprintf('%s_comp_%s', gene, data$predicted_component[mask])
    data$category <- factor(data$category)
    
    plot_continuous_feature_by_categorical_feature(data, 'VAF', 'category', FALSE)
}

# plot_continuous_feature_stratified_by_component_and_gene(dd_maf_clustered, 'SF3B1', c(4, 6))
In [56]:
# plot VAF distribution stratified by selected genes and components
genes <- c('SF3B1', 'SRSF2', 'U2AF1', 'ZRSR2', 'TP53')
components <- list(c(4, 6), c(1, 3, 5), c(5, 7, 8), c(1, 7), c(2, 6))

for (i in 1:length(genes))
    plot_continuous_feature_stratified_by_component_and_gene(dd_maf_clustered, genes[[i]], components[[i]])

AML

In [57]:
plot_categorical_feature_by_categorical_feature(dd_clinical, 'AML', 'predicted_component')
In [58]:
set_notebook_plot_size(12, 4)
plot_survival_curve(dd_clinical, 'predicted_component', get_colors(11, 'grey'), line_size = 0.8, legend_size = 10, legend_interspace = 0.8, surv_diag_years = 'aml_diag_years', surv_status = 'aml_status', cum_event = TRUE)
In [59]:
# plot AML cumulative incidence curves stratified for selected gene and components
genes <- c('SF3B1', 'SRSF2', 'U2AF1_157', 'U2AF1_34', 'ZRSR2', 'del5q', 'TP53')
components <- list(c(4, 6), c(1, 3, 5), c(5, 7), c(8), c(1, 7), c(2, 6), c(2, 6))

for (i in 1:length(genes))
    plot_survival_curve_by_gene(dd_clinical, genes[[i]], components[[i]], AML = TRUE)
In [60]:
set_notebook_plot_size(5, 4)
plot_survival_curve(dd_clinical, 'IPSS_R_simplified', c('#ABDDA4', '#FDAE61', '#D7191C'), surv_diag_years = 'aml_diag_years', surv_status = 'aml_status', cum_event = TRUE)
In [61]:
set_notebook_plot_size(10, 4)
plot_survival_curve(dd_clinical, 'WHO_2008_SIMPLIFY', get_colors(9), surv_diag_years = 'aml_diag_years', surv_status = 'aml_status', cum_event = TRUE)
In [62]:
set_notebook_plot_size(7, 4)

mask          <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'good' & dd_clinical$predicted_component %in% c(1, 3, 4, 6))
wildtype_mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'int')

plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
                                                      mask = mask,
                                                      wildtype_mask = wildtype_mask,
                                                      stratifier_column_name = 'predicted_component',
                                                      legend_prefix = '_IPSSR_good_comp',
                                                      colors = get_colors(4, 'grey'),
                                                      wildtype_name = 'IPSSR_int',
                                                      AML = TRUE)
In [63]:
set_notebook_plot_size(7, 4)

mask          <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'int' & dd_clinical$predicted_component %in% c(1, 3, 4, 6))
wildtype_mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'poor')

plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
                                                      mask = mask,
                                                      wildtype_mask = wildtype_mask,
                                                      stratifier_column_name = 'predicted_component',
                                                      legend_prefix = '_IPSSR_int_comp',
                                                      colors = get_colors(4, 'grey'),
                                                      wildtype_name = 'IPSSR_poor',
                                                      AML = TRUE)
In [64]:
set_notebook_plot_size(7, 4)

mask          <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'poor' & dd_clinical$predicted_component %in% c(2, 3, 4))
wildtype_mask <- (! is.na(dd_clinical$IPSS_R_simplified) & dd_clinical$IPSS_R_simplified == 'int')

plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
                                                      mask = mask,
                                                      wildtype_mask = wildtype_mask,
                                                      stratifier_column_name = 'predicted_component',
                                                      legend_prefix = '_IPSSR_poor_comp',
                                                      colors = get_colors(3, 'grey'),
                                                      wildtype_name = 'IPSSR_int',
                                                      AML = TRUE)

OS multivariate

Work in progress...

Simple comparison of some multivariate model

In [65]:
# let's transform every variable we are going to use to factors 
dd_clinical$IPSS_R      <- factor(dd_clinical$IPSS_R)
dd_clinical$SEX         <- factor(dd_clinical$SEX)
dd_clinical$dmt_active  <- factor(dd_clinical$dmt_active)
dd_clinical$lenalidomid <- factor(dd_clinical$lenalidomid)
dd_clinical$hma         <- factor(dd_clinical$hma)
dd_clinical$transplant  <- factor(dd_clinical$transplant)

Let's compare the concordance score with 4 sets of features:

  • without components nor IPSSR
  • with components only
  • with IPSSR only
  • with both
In [66]:
for (features in c('AGE_AT_SAMPLE_TIME + SEX + dmt_active + lenalidomid + hma + transplant',
                   'AGE_AT_SAMPLE_TIME + SEX + predicted_component + dmt_active + lenalidomid + hma + transplant',
                   'AGE_AT_SAMPLE_TIME + SEX + IPSS_R + dmt_active + lenalidomid + hma + transplant',
                   'AGE_AT_SAMPLE_TIME + SEX + IPSS_R + predicted_component + dmt_active + lenalidomid + hma + transplant')) {
    
    formula <- sprintf('Surv(os_diag_years, os_status) ~ %s', features)
    cat(sprintf('Model: %s\n', formula))
    
    res <- coxph(as.formula(formula), data = dd_clinical)
    cat(sprintf('  ▴ Concordance: %.3f\n\n', res$concordance['concordance']))
}
Model: Surv(os_diag_years, os_status) ~ AGE_AT_SAMPLE_TIME + SEX + dmt_active + lenalidomid + hma + transplant
  ▴ Concordance: 0.609

Model: Surv(os_diag_years, os_status) ~ AGE_AT_SAMPLE_TIME + SEX + predicted_component + dmt_active + lenalidomid + hma + transplant
  ▴ Concordance: 0.702

Model: Surv(os_diag_years, os_status) ~ AGE_AT_SAMPLE_TIME + SEX + IPSS_R + dmt_active + lenalidomid + hma + transplant
  ▴ Concordance: 0.733

Model: Surv(os_diag_years, os_status) ~ AGE_AT_SAMPLE_TIME + SEX + IPSS_R + predicted_component + dmt_active + lenalidomid + hma + transplant
  ▴ Concordance: 0.746

Some detail results on the last set of features significance:

In [67]:
res <- coxph(Surv(os_diag_years, os_status) ~ AGE_AT_SAMPLE_TIME + SEX + IPSS_R + predicted_component + dmt_active + lenalidomid + hma + transplant, data = dd_clinical)
summary(res)

set_notebook_plot_size(8, 6)
ggforest(res, fontsize = 0.5)
Call:
coxph(formula = Surv(os_diag_years, os_status) ~ AGE_AT_SAMPLE_TIME + 
    SEX + IPSS_R + predicted_component + dmt_active + lenalidomid + 
    hma + transplant, data = dd_clinical)

  n= 2495, number of events= 1134 
   (805 observations deleted due to missingness)

                            coef exp(coef)  se(coef)      z Pr(>|z|)    
AGE_AT_SAMPLE_TIME      0.028949  1.029372  0.003291  8.796  < 2e-16 ***
SEXM                    0.231227  1.260146  0.064021  3.612 0.000304 ***
IPSS_Rint               0.453103  1.573186  0.085832  5.279 1.30e-07 ***
IPSS_Rpoor              0.947454  2.579136  0.096018  9.867  < 2e-16 ***
IPSS_Rvery_good        -0.470838  0.624479  0.116506 -4.041 5.32e-05 ***
IPSS_Rvery_poor         1.430524  4.180890  0.103763 13.786  < 2e-16 ***
predicted_component2    0.738750  2.093318  0.112301  6.578 4.76e-11 ***
predicted_component3    0.536538  1.710076  0.105752  5.074 3.90e-07 ***
predicted_component4   -0.164761  0.848096  0.111101 -1.483 0.138079    
predicted_component5    0.778740  2.178726  0.124441  6.258 3.90e-10 ***
predicted_component6   -0.032367  0.968151  0.135705 -0.239 0.811487    
predicted_component7    0.521346  1.684293  0.134042  3.889 0.000100 ***
predicted_component8    0.270040  1.310017  0.150685  1.792 0.073119 .  
predicted_component9    0.725433  2.065626  0.216309  3.354 0.000797 ***
predicted_component10   0.586215  1.797173  0.294396  1.991 0.046454 *  
predicted_component11   0.147411  1.158830  0.458527  0.321 0.747840    
predicted_componentNaN -0.208332  0.811937  0.214772 -0.970 0.332037    
dmt_activeyes           0.326444  1.386031  0.125711  2.597 0.009410 ** 
lenalidomid1           -0.170154  0.843535  0.205710 -0.827 0.408150    
hma1                   -0.224744  0.798721  0.121858 -1.844 0.065139 .  
transplant1            -0.771528  0.462306  0.132226 -5.835 5.38e-09 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

                       exp(coef) exp(-coef) lower .95 upper .95
AGE_AT_SAMPLE_TIME        1.0294     0.9715    1.0228    1.0360
SEXM                      1.2601     0.7936    1.1115    1.4286
IPSS_Rint                 1.5732     0.6357    1.3296    1.8614
IPSS_Rpoor                2.5791     0.3877    2.1367    3.1132
IPSS_Rvery_good           0.6245     1.6013    0.4970    0.7847
IPSS_Rvery_poor           4.1809     0.2392    3.4115    5.1238
predicted_component2      2.0933     0.4777    1.6797    2.6087
predicted_component3      1.7101     0.5848    1.3900    2.1039
predicted_component4      0.8481     1.1791    0.6821    1.0544
predicted_component5      2.1787     0.4590    1.7072    2.7805
predicted_component6      0.9682     1.0329    0.7420    1.2632
predicted_component7      1.6843     0.5937    1.2952    2.1904
predicted_component8      1.3100     0.7633    0.9750    1.7601
predicted_component9      2.0656     0.4841    1.3519    3.1563
predicted_component10     1.7972     0.5564    1.0093    3.2002
predicted_component11     1.1588     0.8629    0.4718    2.8465
predicted_componentNaN    0.8119     1.2316    0.5330    1.2369
dmt_activeyes             1.3860     0.7215    1.0833    1.7733
lenalidomid1              0.8435     1.1855    0.5636    1.2624
hma1                      0.7987     1.2520    0.6290    1.0142
transplant1               0.4623     2.1631    0.3568    0.5991

Concordance= 0.746  (se = 0.008 )
Rsquare= 0.266   (max possible= 0.998 )
Likelihood ratio test= 772.2  on 21 df,   p=<2e-16
Wald test            = 789.8  on 21 df,   p=<2e-16
Score (logrank) test = 907.8  on 21 df,   p=<2e-16
Warning message in .get_data(model, data = data):
“The `data` argument is not provided. Data will be extracted from model fit.”Warning message:
“Removed 7 rows containing missing values (geom_errorbar).”

Cox-lasso model

In [68]:
mean_imputation <- function(data, column_name) {
    # Proceed mean imputation on selected column
    # → Arguments
    #     - data
    #     - column_name: column on which to impute values
    
    cat(sprintf('Mean imputation for %s (%d NA values)\n', column_name, nrow(data[is.na(data[, column_name]),])))
    data[is.na(data[, column_name]), column_name] <- mean(data[, column_name], na.rm = TRUE)
    
    return (data)
}
In [69]:
# prepare data
data <- dd_clinical
data <- mean_imputation(data, 'AGE_AT_SAMPLE_TIME')

for (f in c('AGE_AT_SAMPLE_TIME', 'SEX', 'IPSS_R', 'predicted_component', 'dmt_active', 'lenalidomid', 'hma', 'transplant')) {
    cat(sprintf('Feature %-19s: removed %3d NA values\n', f, nrow(data[is.na(data[, f]),])))
    data <- data[! is.na(data[, f]),]
}

print_size(data)

y <- data[, c('os_diag_years', 'os_status')]
colnames(y) <- c('time', 'status')
x <- model.matrix(~ AGE_AT_SAMPLE_TIME + SEX + IPSS_R + predicted_component + dmt_active + lenalidomid + hma + transplant, data)
Mean imputation for AGE_AT_SAMPLE_TIME (20 NA values)
Feature AGE_AT_SAMPLE_TIME : removed   0 NA values
Feature SEX                : removed   0 NA values
Feature IPSS_R             : removed 780 NA values
Feature predicted_component: removed   0 NA values
Feature dmt_active         : removed  16 NA values
Feature lenalidomid        : removed   0 NA values
Feature hma                : removed   0 NA values
Feature transplant         : removed   0 NA values
Size of data: 2504 x 256
In [70]:
# remove NA status values and negative time values
rm_mask <- is.na(y[,'status']) | y[,'time'] <= 0
sum(rm_mask)

x <- x[! rm_mask,]
y <- y[! rm_mask,]
y <- as.matrix(y)
print_size(x)
print_size(y)
27
Size of x: 2477 x 22
Size of y: 2477 x 2

glmnet

  • $\alpha$ : elastic-net penalty ($\alpha=1$: lasso, $\alpha=0$: ridge)
  • $\lambda$: controls the overall strength of the penalty
  • fit: fitted model
  • plot(fit): each curve corresponds to a variable, it shows the path of its coefficient against the $\ell_1$-norm of the whole coefficient vector at as $\lambda$ varies. The axis above indicates the number of nonzero coefficients at the current $\lambda$, which is the effective degrees of freedom (df) for the lasso.
In [71]:
# fit one model
fit = glmnet(x, y, family = "cox")

# plot lasso path
set_notebook_plot_size(10, 10)
par(mar = c(4, 4, 1, 15))
plot(fit, label = TRUE)
vnat = coef(fit)
vnat = vnat[-1, ncol(vnat)] # remove intercept and get coefficients at the end of the path
axis(4, at = vnat, line = -.4, label = colnames(x)[-1], las = 1, tick = FALSE, cex.axis = 0.7)
  • print(fit): summary of path at each step with the following columns:
    • Df: number of nonzero coefficients
    • %Dev: percent of deviance explained
    • $\lambda$
In [72]:
print(fit)
Call:  glmnet(x = x, y = y, family = "cox") 

      Df     %Dev   Lambda
 [1,]  0 0.000000 0.187900
 [2,]  1 0.003768 0.171200
 [3,]  1 0.006466 0.156000
 [4,]  1 0.008464 0.142100
 [5,]  2 0.010280 0.129500
 [6,]  4 0.014560 0.118000
 [7,]  6 0.019020 0.107500
 [8,]  6 0.022850 0.097950
 [9,]  6 0.026000 0.089250
[10,]  6 0.028610 0.081320
[11,]  6 0.030770 0.074090
[12,]  7 0.032730 0.067510
[13,]  9 0.034760 0.061510
[14,] 11 0.036860 0.056050
[15,] 12 0.038710 0.051070
[16,] 12 0.040530 0.046530
[17,] 13 0.042040 0.042400
[18,] 13 0.043320 0.038630
[19,] 14 0.044390 0.035200
[20,] 16 0.045450 0.032070
[21,] 15 0.046360 0.029220
[22,] 15 0.047100 0.026630
[23,] 15 0.047710 0.024260
[24,] 17 0.048290 0.022110
[25,] 17 0.048830 0.020140
[26,] 17 0.049280 0.018350
[27,] 17 0.049650 0.016720
[28,] 18 0.049960 0.015240
[29,] 18 0.050240 0.013880
[30,] 18 0.050480 0.012650
[31,] 18 0.050680 0.011530
[32,] 18 0.050840 0.010500
[33,] 18 0.050980 0.009569
[34,] 18 0.051100 0.008719
[35,] 18 0.051190 0.007945
[36,] 18 0.051270 0.007239
[37,] 18 0.051340 0.006596
[38,] 18 0.051390 0.006010
[39,] 19 0.051440 0.005476
[40,] 19 0.051510 0.004990
[41,] 20 0.051570 0.004546
[42,] 20 0.051620 0.004142
[43,] 20 0.051660 0.003774
[44,] 20 0.051700 0.003439
[45,] 20 0.051730 0.003134
[46,] 21 0.051760 0.002855
[47,] 21 0.051780 0.002602
[48,] 21 0.051800 0.002370
[49,] 21 0.051820 0.002160
[50,] 21 0.051830 0.001968
[51,] 21 0.051840 0.001793
[52,] 21 0.051850 0.001634
  • cvfit: compute $k$-fold cross-validation
  • plot(cvfit): view the optimal $\lambda$ value and a cross validated error plot
    • left vertical line: where the CV-error curve hits its minimum
    • right vertical line: most regularized model with CV-error within 1 std dev of the minimum
In [73]:
# 10-folds cross validation to find lambda
cvfit = cv.glmnet(x, y, family = 'cox', alpha = 1, nfolds = 10, grouped = TRUE)

# plot results
set_notebook_plot_size(10, 6)
plot(cvfit)
In [74]:
cvfit$lambda.min
cvfit$lambda.1se
0.00237043621661312
0.0292238455387025
In [75]:
coef(cvfit, s = 'lambda.1se')
22 x 1 sparse Matrix of class "dgCMatrix"
                                 1
(Intercept)             .         
AGE_AT_SAMPLE_TIME      0.02089268
SEXM                    0.11275873
IPSS_Rint               0.20398802
IPSS_Rpoor              0.76639071
IPSS_Rvery_good        -0.39694028
IPSS_Rvery_poor         1.25601535
predicted_component2    0.40408349
predicted_component3    0.21462811
predicted_component4   -0.23347151
predicted_component5    0.37399522
predicted_component6   -0.05011404
predicted_component7    0.06984766
predicted_component8    .         
predicted_component9    .         
predicted_component10   .         
predicted_component11   .         
predicted_componentNaN -0.04734575
dmt_activeyes           0.03872877
lenalidomid1            .         
hma1                    .         
transplant1            -0.36691068
In [76]:
# format coef to an easy-to-use dataframe
coefs <- as.data.frame(as.matrix(coef(cvfit, s = 'lambda.1se')[-1,])) # remove intercept and convert to dataframe
colnames(coefs) <- 'coefficient_value'
coefs$feature_name <- rownames(coefs)

coefs$feature_group <- c('other', 'other', 'IPSSR', 'IPSSR', 'IPSSR', 'IPSSR', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'other', 'other', 'other', 'other')

rownames(coefs) <- NULL
coefs <- coefs[order(coefs$coefficient_value),]
coefs$feature_name <- factor(coefs$feature_name, levels = coefs$feature_name)
coefs
coefficient_valuefeature_namefeature_group
5-0.39694028 IPSS_Rvery_good IPSSR
21-0.36691068 transplant1 other
9-0.23347151 predicted_component4 component
11-0.05011404 predicted_component6 component
17-0.04734575 predicted_componentNaNcomponent
13 0.00000000 predicted_component8 component
14 0.00000000 predicted_component9 component
15 0.00000000 predicted_component10 component
16 0.00000000 predicted_component11 component
19 0.00000000 lenalidomid1 other
20 0.00000000 hma1 other
1 0.02089268 AGE_AT_SAMPLE_TIME other
18 0.03872877 dmt_activeyes other
12 0.06984766 predicted_component7 component
2 0.11275873 SEXM other
3 0.20398802 IPSS_Rint IPSSR
8 0.21462811 predicted_component3 component
10 0.37399522 predicted_component5 component
7 0.40408349 predicted_component2 component
4 0.76639071 IPSS_Rpoor IPSSR
6 1.25601535 IPSS_Rvery_poor IPSSR
In [77]:
ggplot(coefs) + geom_bar(aes(feature_name, coefficient_value, fill = feature_group), stat = 'identity') + coord_flip()

Let's run the experiment above 100 times with different random sampling of the data, and look at the probability of being null for each feature:

In [78]:
features_list <- data.frame(feature_name = colnames(x)[-1], count_not_null = 0)
features_list$feature_group <- c('other', 'other', 'IPSSR', 'IPSSR', 'IPSSR', 'IPSSR', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'other', 'other', 'other', 'other')

n_experiment <- 100

for (i in 1:n_experiment) {
    print_and_flush(sprintf('\rExperiment %d/%d...', i, n_experiment))
    
    # sample 80% of x and y with replacement
    set.seed(i * 100)
    sampling <- sample(1:nrow(x), 0.8 * nrow(x), replace = TRUE)
    
    cvfit <- cv.glmnet(x[sampling,], y[sampling,], family = 'cox', alpha = 1, nfolds = 10, grouped = TRUE)
    
    coefs <- as.data.frame(as.matrix(coef(cvfit, s = 'lambda.1se')[-1,])) # remove intercept and convert to dataframe
    colnames(coefs) <- 'coefficient_value'
    coefs$feature_name <- rownames(coefs)
    rownames(coefs) <- NULL
    coefs$not_null <- coefs$coefficient_value != 0

    features_list$count_not_null <- features_list$count_not_null + coefs$not_null
}
cat(' done!')

features_list$count_not_null <- features_list$count_not_null / n_experiment
features_list
Experiment 100/100... done!
feature_namecount_not_nullfeature_group
AGE_AT_SAMPLE_TIME 1.00 other
SEXM 0.88 other
IPSS_Rint 0.93 IPSSR
IPSS_Rpoor 1.00 IPSSR
IPSS_Rvery_good 1.00 IPSSR
IPSS_Rvery_poor 1.00 IPSSR
predicted_component2 1.00 component
predicted_component3 0.94 component
predicted_component4 1.00 component
predicted_component5 1.00 component
predicted_component6 0.72 component
predicted_component7 0.55 component
predicted_component8 0.12 component
predicted_component9 0.25 component
predicted_component10 0.25 component
predicted_component11 0.01 component
predicted_componentNaN0.61 component
dmt_activeyes 0.46 other
lenalidomid1 0.01 other
hma1 0.46 other
transplant1 1.00 other
In [79]:
# order
features_list <- features_list[order(features_list$count_not_null),]
features_list$feature_name <- factor(features_list$feature_name, levels = features_list$feature_name)

ggplot(features_list) + geom_bar(aes(feature_name, count_not_null, fill = feature_group), stat = 'identity') + coord_flip()

Adding gene and cytogenetics features

Let's make the same two plots but adding genes and cytogenetics features:

In [80]:
# prepare data
data <- dd_clinical
data <- mean_imputation(data, 'AGE_AT_SAMPLE_TIME')

for (f in c('AGE_AT_SAMPLE_TIME', 'SEX', 'IPSS_R', 'predicted_component', 'dmt_active', 'lenalidomid', 'hma', 'transplant')) {
    cat(sprintf('Feature %-19s: removed %3d NA values\n', f, nrow(data[is.na(data[, f]),])))
    data <- data[! is.na(data[, f]),]
}

print_size(data)

y <- data[, c('os_diag_years', 'os_status')]
colnames(y) <- c('time', 'status')
formula <- sprintf('~ AGE_AT_SAMPLE_TIME + SEX + IPSS_R + predicted_component + dmt_active + lenalidomid + hma + transplant + %s', paste(colnames(dd_all), collapse = ' + '))

x <- model.matrix(as.formula(formula), data)
Mean imputation for AGE_AT_SAMPLE_TIME (20 NA values)
Feature AGE_AT_SAMPLE_TIME : removed   0 NA values
Feature SEX                : removed   0 NA values
Feature IPSS_R             : removed 780 NA values
Feature predicted_component: removed   0 NA values
Feature dmt_active         : removed  16 NA values
Feature lenalidomid        : removed   0 NA values
Feature hma                : removed   0 NA values
Feature transplant         : removed   0 NA values
Size of data: 2504 x 256
In [81]:
rm_mask <- is.na(y[,'status']) | y[,'time'] <= 0
sum(rm_mask)

x <- x[! rm_mask,]
y <- y[! rm_mask,]
y <- as.matrix(y)
print_size(x)
print_size(y)
27
Size of x: 2477 x 179
Size of y: 2477 x 2
In [82]:
# fit one model
fit = glmnet(x, y, family = "cox")

# plot lasso path
set_notebook_plot_size(20, 10)
plot(fit)
In [83]:
# 10-folds cross validation to find lambda
cvfit = cv.glmnet(x, y, family = 'cox', alpha = 1, nfolds = 10, grouped = TRUE)

# plot results
set_notebook_plot_size(10, 6)
plot(cvfit)
In [84]:
cvfit$lambda.min
cvfit$lambda.1se
0.018353426711371
0.0320731343989067
In [85]:
# format coef to an easy-to-use dataframe
coefs <- as.data.frame(as.matrix(coef(cvfit, s = 'lambda.1se')[-1,])) # remove intercept and convert to dataframe
colnames(coefs) <- 'coefficient_value'
coefs$feature_name <- rownames(coefs)
coefs$feature_group <- c('other', 'other', 'IPSSR', 'IPSSR', 'IPSSR', 'IPSSR', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'other', 'other', 'other', 'other',
                         rep('gene', 107), rep('cyto', 50))

rownames(coefs) <- NULL
coefs <- coefs[order(coefs$coefficient_value),]
coefs$feature_name <- factor(coefs$feature_name, levels = coefs$feature_name)
In [86]:
set_notebook_plot_size(10, 25)
ggplot(coefs) + geom_bar(aes(feature_name, coefficient_value, fill = feature_group), stat = 'identity') + coord_flip()
In [87]:
features_list <- data.frame(feature_name = colnames(x)[-1], count_not_null = 0)
features_list$feature_group <- c('other', 'other', 'IPSSR', 'IPSSR', 'IPSSR', 'IPSSR', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'component', 'other', 'other', 'other', 'other',
                         rep('gene', 107), rep('cyto', 50))

n_experiment <- 100

for (i in 1:n_experiment) {
    print_and_flush(sprintf('\rExperiment %d/%d...', i, n_experiment))
    
    # sample 80% of x and y with replacement
    set.seed(i * 100)
    sampling <- sample(1:nrow(x), 0.8 * nrow(x), replace = TRUE)
    
    cvfit <- cv.glmnet(x[sampling,], y[sampling,], family = 'cox', alpha = 1, nfolds = 10, grouped = TRUE)
    
    coefs <- as.data.frame(as.matrix(coef(cvfit, s = 'lambda.1se')[-1,])) # remove intercept and convert to dataframe
    colnames(coefs) <- 'coefficient_value'
    coefs$feature_name <- rownames(coefs)
    rownames(coefs) <- NULL
    
    coefs$not_null <- coefs$coefficient_value != 0
    
    features_list$count_not_null <- features_list$count_not_null + coefs$not_null
}
cat(' done!')

features_list$count_not_null <- features_list$count_not_null / n_experiment
Experiment 100/100... done!
In [88]:
# order
features_list <- features_list[order(features_list$count_not_null),]
features_list$feature_name <- factor(features_list$feature_name, levels = features_list$feature_name)

ggplot(features_list) + geom_bar(aes(feature_name, count_not_null, fill = feature_group), stat = 'identity') + coord_flip()
In [90]:
set_notebook_plot_size(7, 4)

mask          <- (dd_clinical$MYC == 1)
wildtype_mask <- ! mask

plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
                                                      mask = mask,
                                                      wildtype_mask = wildtype_mask,
                                                      stratifier_column_name = '',
                                                      legend_prefix = 'has_MYC',
                                                      colors = get_colors(1, 'grey'),
                                                      wildtype_name = 'wildtype',
                                                      AML = TRUE)

plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
                                                      mask = mask,
                                                      wildtype_mask = wildtype_mask,
                                                      stratifier_column_name = '',
                                                      legend_prefix = 'has_MYC',
                                                      colors = get_colors(1, 'grey'),
                                                      wildtype_name = 'wildtype',
                                                      AML = FALSE)
In [91]:
set_notebook_plot_size(7, 4)

mask          <- (dd_clinical$plus13 == 1)
wildtype_mask <- ! mask

plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
                                                      mask = mask,
                                                      wildtype_mask = wildtype_mask,
                                                      stratifier_column_name = '',
                                                      legend_prefix = 'has_plus13',
                                                      colors = get_colors(1, 'grey'),
                                                      wildtype_name = 'wildtype',
                                                      AML = TRUE)

plot_survival_curve_stratified_by_categorical_feature(dd_clinical,
                                                      mask = mask,
                                                      wildtype_mask = wildtype_mask,
                                                      stratifier_column_name = '',
                                                      legend_prefix = 'has_plus13',
                                                      colors = get_colors(1, 'grey'),
                                                      wildtype_name = 'wildtype',
                                                      AML = FALSE)